##########################################################
# John Henderson and Alex Theodoridis
# Replication Data for: "Seeing Spots", 
#  Forthcoming in Political Behavior, August 20, 2017
# 
##########################################################
#
#  pre_data.R
#  -- file produces outcome and treatment objects prior to analysis
#
##########################################################

# Outcomes 
#  - all_y - summary of four info-seeking choices (1-video_skipped,replay,share,getlink)
#  - video_skipped - skip video or not
#  - replay - replay video or not                           
#  - share - share with friend
#  - getlink - link to see videos like this 
#  - time_watched - total time watched first pass
#  - total_time - first-pass time plus second-viewing time (time_watched + time_again)

#  - obama_pos, obama_neg, romney_pos, romney_neg, neg_ad, pos_ad, obama_ad, romney_ad
#  - pid3, pid7, pid_lean, tr, vids

# Clear the workspace, then load the foreign and stringr packages.
rm(list = ls())
library(foreign)
library(stringr)

# Read the survey extract; text columns stay character rather than factor.
cces_data <- read.csv(
  file = "~/Dropbox/Seeing_Spots/replication/cces_data_pres.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Skip indicator, cut at the 30-second mark:
#   1  = skipped      (video_time <= 30)
#   0  = not skipped  (video_time >  30)
#   NA = video_time missing or not numeric
# video_time is coerced with as.numeric() so that a column read in as text is
# compared by value rather than alphabetically, and a time of exactly 30
# seconds is counted as skipped instead of being left NA.
cces_data$video_skipped <- array(NA, nrow(cces_data))
cces_data$video_skipped[which(as.numeric(cces_data$video_time) <= 30)] <- 1
cces_data$video_skipped[which(as.numeric(cces_data$video_time) > 30)] <- 0

# treatment measures
# tr is the raw treatment label. Characters 1-3 of the label are compared
# with "Pos"/"Neg" and characters 10-12 with "Oba"/"Rom".
tr <- cces_data$videos_treat

pos_ad <- str_sub(tr, start = 1, end = 3) == "Pos"
neg_ad <- str_sub(tr, start = 1, end = 3) == "Neg"
obama_ad <- str_sub(tr, start = 10, end = 12) == "Oba"
romney_ad <- str_sub(tr, start = 10, end = 12) == "Rom"

# tone-by-candidate cells
obama_pos <- pos_ad & obama_ad
obama_neg <- neg_ad & obama_ad
romney_pos <- pos_ad & romney_ad
romney_neg <- neg_ad & romney_ad

# single coded treatment:
#   0 = obama_pos, 1 = obama_neg, 2 = romney_pos, 3 = romney_neg
# NOTE(review): a label that falls in none of the four cells also yields 0.
tr_full <- 0 * obama_pos + 1 * obama_neg + 2 * romney_pos + 3 * romney_neg

# pid_lean is built from cces_data$pid3lean
# pid7 is cces_data$pid7zero
# Recode in order: 1 -> -1, 2 -> 0, 3 -> 1; any value still above 1 after
# those three steps becomes NA. Values at or below 1 that match none of the
# three codes are left unchanged.
pid3 <- as.numeric(cces_data$pid3lean)
pid3[pid3 %in% 1] <- -1
pid3[pid3 %in% 2] <- 0
pid3[pid3 %in% 3] <- 1
pid3[!is.na(pid3) & pid3 > 1] <- NA

pid_lean <- pid3
table(pid_lean)

# political interest measures

# Knowledge items: TRUE when the coded answer equals the keyed value
# (CC309a == 1, CC309b == 2), FALSE for any other code, NA when the code is
# missing. Both are kept as one-dimensional arrays.
hou_majority_correct <- array(
  as.numeric(cces_data$CC309a) == 1,
  length(cces_data$CC309a)
)
sen_majority_correct <- array(
  as.numeric(cces_data$CC309b) == 2,
  length(cces_data$CC309a)
)

# number of items answered correctly (NA if either item is NA)
majority <- hou_majority_correct + sen_majority_correct

# News interest: codes above 4 are folded into 4, then the scale is flipped
# so that it runs 0..3 with code 1 mapping to 3 and code 4 mapping to 0.
newsint <- as.numeric(cces_data$newsint)
newsint <- replace(newsint, which(newsint > 4), 4)
newsint <- -(newsint - 4)


#
#Name:         newsint         
#Description:  Interest in news and public affairs
#
#        Count Code Label
#        ----- ---- -----
#          569    1 Most of the time
#          243    2 Some of the time
#          115    3 Only now and then
#           53    4 Hardly at all
#           18    7 Don't know
#            2    8 Skipped
#            0    9 Not Asked


# outcomes
# skipping and video time

# not skipping is if video time > 30 
# skipping is if video time <= 30
# trim time if took longer than 60 seconds [not sure these should be NA and not 0 on skipping]

# time watched on the first pass
# Negative times are treated as missing. time_float1 keeps the untrimmed
# series; time_watched additionally drops anything above 60 seconds.
time_watched <- as.numeric(cces_data$video_time)
time_watched <- replace(time_watched, which(time_watched < 0), NA)
time_float1 <- time_watched
time_watched <- replace(time_watched, which(time_watched > 60), NA)

# alternative is skipping before 25 ... last 5 seconds are often immaterial ...
# Rows coded as skipped but with video_time of at least 25 are recoded to
# not skipped in the alternative measure.
video_skipped <- cces_data$video_skipped
video_skipped_alt <- video_skipped
video_skipped_alt[
  which(video_skipped == 1 & as.numeric(cces_data$video_time) >= 25)
] <- 0

# both measures are missing wherever the trimmed time is missing
video_skipped[is.na(time_watched)] <- NA
video_skipped_alt[is.na(time_watched)] <- NA

# proportion of ads skipped by type:
# tr holds text treatment labels, so as.numeric(tr) is NA for every row and
# the per-type means all came out NaN. The labels are converted to integer
# codes through factor() instead; code j is the j-th label in sorted order.
# skipads stays a length-12 array; a slot whose code has no rows (or only
# missing skip values) is NaN.
# NOTE(review): 12 is the assumed number of distinct treatment labels; confirm.
tr_code <- as.integer(factor(tr))
skipads <- array(0, 12)
for (j in seq_len(12)) {
  skipads[j] <- mean(video_skipped[which(tr_code == j)], na.rm = TRUE)
}


# time watched
# First viewing: negative times become NA; time_float1 is the untrimmed
# series and time_watched is additionally set to NA above 60 seconds.
time_watched <- as.numeric(cces_data$video_time)
time_watched <- replace(time_watched, which(time_watched < 0), NA)
time_float1 <- time_watched
time_watched <- replace(time_watched, which(time_watched > 60), NA)

# Second viewing (video_time2), trimmed the same way.
time_again <- as.numeric(cces_data$video_time2)
time_again <- replace(time_again, which(time_again < 0), NA)
time_float2 <- time_again
time_again <- replace(time_again, which(time_again > 60), NA)

# Sums of the two viewings; NA whenever either component is NA.
total_time <- time_watched + time_again
total_float <- time_float1 + time_float2

# Recode a 1/2 survey item: codes above 2 become NA, then 2 becomes 0.
recode_yes_no <- function(item) {
  item <- as.numeric(item)
  item[which(item > 2)] <- NA
  item[which(item == 2)] <- 0
  item
}

# replay is given not skipped; watch again
replay <- recode_yes_no(cces_data$videoagain)

# share is share with friend, etc
share <- recode_yes_no(cces_data$AGT309)

# getlink is ask for a video link
getlink <- recode_yes_no(cces_data$AGT310)
    
# additive scale of 4 information seeking items
# Columns: not skipped (1 - video_skipped), share, replay, getlink.
# The score is the row sum over the non-missing items; a row where all four
# items are missing is NA rather than 0.
all_y <- cbind(1 - video_skipped, share, replay, getlink)
all_y <- replace(
  rowSums(all_y, na.rm = TRUE),
  which(rowSums(is.na(all_y)) == ncol(all_y)),
  NA
)
#all_y=all_y/max(all_y,na.rm=T)
      
# reliability
# Optional diagnostics, switched off by default.
do.reliability <- FALSE
if (isTRUE(do.reliability)) {
  library(psych)
  rel_y <- cbind(1 - video_skipped, share, replay, getlink)
  alpha(rel_y[obama_ad, ])

  # about .49
  # Fill each item's missing cells with random 0/1 draws, weighted by the
  # observed share of each value in that item.
  set.seed(1005)
  for (k in seq_len(ncol(rel_y))) {
    gaps <- which(is.na(rel_y[, k]))
    seen <- rel_y[which(!is.na(rel_y[, k])), k]
    freq <- table(seen)
    rel_y[gaps, k] <- sample(
      c(0, 1),
      prob = freq / sum(freq),
      size = length(gaps),
      replace = TRUE
    )
  }

  # one-dimensional scaling of the filled-in items, correlated with each item
  distMat <- dist(rel_y)
  all_sum <- cmdscale(distMat, k = 1)
  cor(all_sum, rel_y)
  # dimensionality dominated by not-skipping outcome
}

# END pre_data.R    